Using the chosen dataset, I aim to predict whether a bank customer’s loan will be approved. To do this, I will apply two classification algorithms (K-NN and Naive Bayes), compare their results, and assess which one is more accurate and better suited to this dataset.
DETAILS:
# LOADING THE DATASET:
# Directory that holds the two CSV files. It defaults to the original
# location; set the LOAN_DATA_DIR environment variable to run the analysis
# from another machine or folder without editing the script.
default_data_dir <- "/Users/ajithrajperiyasamy/Desktop/FILES/KSU FILES/CAPSTONE/BANK LOAN/CODING"
data_dir <- Sys.getenv("LOAN_DATA_DIR", unset = default_data_dir)
if (!nzchar(data_dir)) {
  # An empty override would resolve to the filesystem root; fall back instead.
  data_dir <- default_data_dir
}
Training_dataset <- read.csv(file.path(data_dir, "Training dataset.csv"))
Testing_dataset <- read.csv(file.path(data_dir, "Testing dataset.csv"))
head(Training_dataset) # Displays the first 6 rows of every column
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## 1 LP001002 Male No 0 Graduate No 5849
## 2 LP001003 Male Yes 1 Graduate No 4583
## 3 LP001005 Male Yes 0 Graduate Yes 3000
## 4 LP001006 Male Yes 0 Not Graduate No 2583
## 5 LP001008 Male No 0 Graduate No 6000
## 6 LP001011 Male Yes 2 Graduate Yes 5417
## CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
## 1 0 NA 360 1 Urban
## 2 1508 128 360 1 Rural
## 3 0 66 360 1 Urban
## 4 2358 120 360 1 Urban
## 5 0 141 360 1 Urban
## 6 4196 267 360 1 Urban
## Loan_Status
## 1 Y
## 2 N
## 3 Y
## 4 Y
## 5 Y
## 6 Y
# Show each column's type together with a preview of its first values.
str(object = Training_dataset)
## 'data.frame': 614 obs. of 13 variables:
## $ Loan_ID : chr "LP001002" "LP001003" "LP001005" "LP001006" ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Married : chr "No" "Yes" "Yes" "Yes" ...
## $ Dependents : chr "0" "1" "0" "0" ...
## $ Education : chr "Graduate" "Graduate" "Graduate" "Not Graduate" ...
## $ Self_Employed : chr "No" "No" "Yes" "No" ...
## $ ApplicantIncome : int 5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
## $ CoapplicantIncome: num 0 1508 0 2358 0 ...
## $ LoanAmount : int NA 128 66 120 141 267 95 158 168 349 ...
## $ Loan_Amount_Term : int 360 360 360 360 360 360 360 360 360 360 ...
## $ Credit_History : int 1 1 1 1 1 1 1 0 1 1 ...
## $ Property_Area : chr "Urban" "Rural" "Urban" "Urban" ...
## $ Loan_Status : chr "Y" "N" "Y" "Y" ...
# Missing Value:
# Cell-level count of true NA values. is.na() does not flag empty strings, so
# blanks in the character columns are counted separately further down.
total_missing <- sum(is.na(Training_dataset))
total_missing
## [1] 86
total_cells <- nrow(Training_dataset) * ncol(Training_dataset)
total_cells
## [1] 7982
percent_missing <- (total_missing / total_cells) * 100
percent_missing
## [1] 1.077424
print(paste("Percentage of missing values in the dataset:",percent_missing,"%")) # Only about 1% of the cells are NA, so the incomplete rows are omitted rather than imputed. Note that na.omit() removes whole rows, so the share of rows lost is larger than this cell-level figure (reported below).
## [1] "Percentage of missing values in the dataset: 1.07742420446004 %"
training_dataset <- na.omit(Training_dataset)
# Row-level impact of the omission: this is the figure that actually matters
# for the models, because every row with at least one NA is discarded.
rows_dropped <- nrow(Training_dataset) - nrow(training_dataset)
percent_rows_dropped <- (rows_dropped / nrow(Training_dataset)) * 100
print(sprintf(
  "Rows removed by na.omit(): %d of %d (%.2f%%)",
  rows_dropped, nrow(Training_dataset), percent_rows_dropped
))
# Empty strings are not NA, so na.omit() keeps those rows.
# NOTE(review): presumably the categorical columns contain such blanks (a bare
# `Gender` indicator column appears after encoding) - confirm and decide
# whether they should be treated as missing too.
blank_cells <- sum(vapply(
  Training_dataset,
  function(column) sum(column == "", na.rm = TRUE),
  integer(1)
))
print(paste("Blank (empty-string) cells not treated as missing:", blank_cells))
# DESCRIPTIVE AND BASIC STATISTICS:
# 1.Summary Statistics:
# Five-number summary plus mean for the numeric columns and length/class for
# the character columns. This runs on the raw data, so the per-column NA
# counts are visible as well.
summary(object = Training_dataset)
## Loan_ID Gender Married Dependents
## Length:614 Length:614 Length:614 Length:614
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Education Self_Employed ApplicantIncome CoapplicantIncome
## Length:614 Length:614 Min. : 150 Min. : 0
## Class :character Class :character 1st Qu.: 2878 1st Qu.: 0
## Mode :character Mode :character Median : 3812 Median : 1188
## Mean : 5403 Mean : 1621
## 3rd Qu.: 5795 3rd Qu.: 2297
## Max. :81000 Max. :41667
##
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## Min. : 9.0 Min. : 12 Min. :0.0000 Length:614
## 1st Qu.:100.0 1st Qu.:360 1st Qu.:1.0000 Class :character
## Median :128.0 Median :360 Median :1.0000 Mode :character
## Mean :146.4 Mean :342 Mean :0.8422
## 3rd Qu.:168.0 3rd Qu.:360 3rd Qu.:1.0000
## Max. :700.0 Max. :480 Max. :1.0000
## NA's :22 NA's :14 NA's :50
## Loan_Status
## Length:614
## Class :character
## Mode :character
##
##
##
##
# 2. Scatter plot:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Identify numeric columns
# Keep only the columns for which is.numeric() is TRUE and record their names
# in their original order.
numeric_vars <- names(Filter(is.numeric, Training_dataset))
numeric_vars
## [1] "ApplicantIncome" "CoapplicantIncome" "LoanAmount"
## [4] "Loan_Amount_Term" "Credit_History"
library(ggplot2)
# Pairwise scatter plots for numeric variables
# One plot per unordered pair of numeric columns, coloured by Loan_Status.
# combn() yields the pairs in the same order as a nested i < j loop and, with
# the length guard, copes with fewer than two numeric columns (where
# 1:(n - 1) would count backwards).
if (length(numeric_vars) >= 2) {
  for (pair in combn(numeric_vars, 2, simplify = FALSE)) {
    x_var <- pair[1]
    y_var <- pair[2]
    # .data[[name]] looks a column up by its string name; it replaces the
    # deprecated aes_string().
    p <- ggplot(
      Training_dataset,
      aes(x = .data[[x_var]], y = .data[[y_var]], color = Loan_Status)
    ) +
      geom_point(alpha = 0.6) +
      labs(title = paste(x_var, "vs", y_var), x = x_var, y = y_var) +
      theme_minimal()
    print(p)
  }
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 22 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 50 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 22 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 50 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 36 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 71 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 64 rows containing missing values or values outside the scale range
## (`geom_point()`).
# 3. BARPLOT:
# Frequency of each loan decision in the raw training data.
loan_status_counts <- table(Training_dataset$Loan_Status)
# The value returned by barplot() is stored under the name `barplot`, as
# before.
# NOTE(review): this variable shares its name with the plotting function;
# later calls to barplot() still resolve to the function.
barplot <- barplot(
  loan_status_counts,
  main = "Loan Status",
  xlab = "Decision",
  ylab = "Frequency"
)
# 4. BOXPLOT:
# One box plot per monetary column, split by loan status. Names are the column
# names (also used as the y-axis label); values are the wording used in the
# plot title.
boxplot_titles <- c(
  ApplicantIncome = "Applicant Income",
  CoapplicantIncome = "Coapplicant Income",
  LoanAmount = "Loan Amount"
)
for (column_name in names(boxplot_titles)) {
  box_plot <- ggplot(
    training_dataset,
    aes(x = Loan_Status, y = .data[[column_name]])
  ) +
    geom_boxplot() +
    labs(
      title = paste("Box Plot of", boxplot_titles[[column_name]], "by Loan Status"),
      x = "Loan Status",
      y = column_name
    )
  # Explicit print() is required inside a loop for the plot to be drawn.
  print(box_plot)
}
# 5. HISTOGRAM:
library(ggplot2)
# Applicant income distribution
ggplot(training_dataset, aes(x = ApplicantIncome)) +
  geom_histogram(binwidth = 1000, fill = "lightblue", color = "black") +
  labs(title = "Applicant Income Distribution", x = "ApplicantIncome", y = "Count")
# Co-applicant income distribution
ggplot(training_dataset, aes(x = CoapplicantIncome)) +
  geom_histogram(binwidth = 1000, fill = "lightgreen", color = "black") +
  labs(title = "Co-applicant Income Distribution", x = "CoapplicantIncome", y = "Count")
# Loan amount distribution
# LoanAmount ranges from 9 to 700 in this data, so the previous 500-wide bins
# squeezed everything into two or three bars; 25-wide bins show the shape.
ggplot(training_dataset, aes(x = LoanAmount)) +
  geom_histogram(binwidth = 25, fill = "lightcoral", color = "black") +
  labs(title = "Loan Amount Distribution", x = "LoanAmount", y = "Count")
# Credit History distribution
# Credit_History is an integer flag with values 0 and 1, so a 10-wide
# histogram bin merged both values into one bar. Draw one bar per value.
ggplot(training_dataset, aes(x = factor(Credit_History))) +
  geom_bar(fill = "lightgoldenrod", color = "black") +
  labs(title = "Credit History Distribution", x = "Credit History", y = "Count")
# CONVERTING CATEGORICAL VARIABLES TO NUMERIC BY ONE-HOT ENCODING:
library(caret)
## Loading required package: lattice
# Preview the first six rows of the NA-free training data before encoding.
head(training_dataset, n = 6L)
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## 2 LP001003 Male Yes 1 Graduate No 4583
## 3 LP001005 Male Yes 0 Graduate Yes 3000
## 4 LP001006 Male Yes 0 Not Graduate No 2583
## 5 LP001008 Male No 0 Graduate No 6000
## 6 LP001011 Male Yes 2 Graduate Yes 5417
## 7 LP001013 Male Yes 0 Not Graduate No 2333
## CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
## 2 1508 128 360 1 Rural
## 3 0 66 360 1 Urban
## 4 2358 120 360 1 Urban
## 5 0 141 360 1 Urban
## 6 4196 267 360 1 Urban
## 7 1516 95 360 1 Urban
## Loan_Status
## 2 N
## 3 Y
## 4 Y
## 5 Y
## 6 Y
## 7 Y
# Fit one caret encoder per categorical column. Each encoder keeps its own
# name so it stays available after this step.
dummy_gender <- dummyVars(~ Gender, data = training_dataset)
dummy_Married <- dummyVars(~ Married, data = training_dataset)
dummy_Education <- dummyVars(~ Education, data = training_dataset)
dummy_Self_Employed <- dummyVars(~ Self_Employed, data = training_dataset)
dummy_Property_Area <- dummyVars(~ Property_Area, data = training_dataset)
dummy_Loan_Status <- dummyVars(~ Loan_Status, data = training_dataset)
dummy_Dependents <- dummyVars(~ Dependents, data = training_dataset)
# Apply the encoders in a fixed order; later steps index the resulting
# columns by position, so this order must not change.
encoders <- list(
  dummy_gender,
  dummy_Married,
  dummy_Education,
  dummy_Self_Employed,
  dummy_Property_Area,
  dummy_Loan_Status,
  dummy_Dependents
)
indicator_blocks <- lapply(encoders, predict, newdata = training_dataset)
# Append every indicator matrix to the right of the original columns.
# NOTE(review): the result contains indicator columns named just `Gender`,
# `Married`, `Self_Employed` and `Dependents`; presumably these are the
# empty-string level of each variable - confirm.
encoded_training_dataset <- do.call(
  cbind,
  c(list(training_dataset), indicator_blocks)
)
head(encoded_training_dataset)
## Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome
## 2 LP001003 Male Yes 1 Graduate No 4583
## 3 LP001005 Male Yes 0 Graduate Yes 3000
## 4 LP001006 Male Yes 0 Not Graduate No 2583
## 5 LP001008 Male No 0 Graduate No 6000
## 6 LP001011 Male Yes 2 Graduate Yes 5417
## 7 LP001013 Male Yes 0 Not Graduate No 2333
## CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
## 2 1508 128 360 1 Rural
## 3 0 66 360 1 Urban
## 4 2358 120 360 1 Urban
## 5 0 141 360 1 Urban
## 6 4196 267 360 1 Urban
## 7 1516 95 360 1 Urban
## Loan_Status Gender GenderFemale GenderMale Married MarriedNo MarriedYes
## 2 N 0 0 1 0 0 1
## 3 Y 0 0 1 0 0 1
## 4 Y 0 0 1 0 0 1
## 5 Y 0 0 1 0 1 0
## 6 Y 0 0 1 0 0 1
## 7 Y 0 0 1 0 0 1
## EducationGraduate EducationNot Graduate Self_Employed Self_EmployedNo
## 2 1 0 0 1
## 3 1 0 0 0
## 4 0 1 0 1
## 5 1 0 0 1
## 6 1 0 0 0
## 7 0 1 0 1
## Self_EmployedYes Property_AreaRural Property_AreaSemiurban Property_AreaUrban
## 2 0 1 0 0
## 3 1 0 0 1
## 4 0 0 0 1
## 5 0 0 0 1
## 6 1 0 0 1
## 7 0 0 0 1
## Loan_StatusN Loan_StatusY Dependents Dependents0 Dependents1 Dependents2
## 2 1 0 0 0 1 0
## 3 0 1 0 1 0 0
## 4 0 1 0 1 0 0
## 5 0 1 0 1 0 0
## 6 0 1 0 0 0 1
## 7 0 1 0 1 0 0
## Dependents3+
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
# Remove the original non-numeric columns by position: 1-6 are Loan_ID,
# Gender, Married, Dependents, Education and Self_Employed; 12 and 13 are
# Property_Area and Loan_Status. Their indicator columns are kept.
source_column_positions <- c(1:6, 12, 13)
encoded_training_dataset <- encoded_training_dataset[, -source_column_positions]
head(encoded_training_dataset)
## ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
## 2 4583 1508 128 360 1
## 3 3000 0 66 360 1
## 4 2583 2358 120 360 1
## 5 6000 0 141 360 1
## 6 5417 4196 267 360 1
## 7 2333 1516 95 360 1
## Gender GenderFemale GenderMale Married MarriedNo MarriedYes EducationGraduate
## 2 0 0 1 0 0 1 1
## 3 0 0 1 0 0 1 1
## 4 0 0 1 0 0 1 0
## 5 0 0 1 0 1 0 1
## 6 0 0 1 0 0 1 1
## 7 0 0 1 0 0 1 0
## EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## 2 0 0 1 0
## 3 0 0 0 1
## 4 1 0 1 0
## 5 0 0 1 0
## 6 0 0 0 1
## 7 1 0 1 0
## Property_AreaRural Property_AreaSemiurban Property_AreaUrban Loan_StatusN
## 2 1 0 0 1
## 3 0 0 1 0
## 4 0 0 1 0
## 5 0 0 1 0
## 6 0 0 1 0
## 7 0 0 1 0
## Loan_StatusY Dependents Dependents0 Dependents1 Dependents2 Dependents3+
## 2 0 0 0 1 0 0
## 3 1 0 1 0 0 0
## 4 1 0 1 0 0 0
## 5 1 0 1 0 0 0
## 6 1 0 0 0 1 0
## 7 1 0 1 0 0 0
# NORMALIZING THE DATA:
# Check the value ranges before rescaling.
summary(object = encoded_training_dataset)
## ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
## Min. : 150 Min. : 0 Min. : 9.0 Min. : 36.0
## 1st Qu.: 2900 1st Qu.: 0 1st Qu.:100.0 1st Qu.:360.0
## Median : 3816 Median : 1086 Median :128.0 Median :360.0
## Mean : 5508 Mean : 1542 Mean :145.9 Mean :342.4
## 3rd Qu.: 5815 3rd Qu.: 2232 3rd Qu.:167.0 3rd Qu.:360.0
## Max. :81000 Max. :33837 Max. :700.0 Max. :480.0
## Credit_History Gender GenderFemale GenderMale
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :1.0000 Median :0.00000 Median :0.0000 Median :1.0000
## Mean :0.8507 Mean :0.02268 Mean :0.1796 Mean :0.7977
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## Married MarriedNo MarriedYes EducationGraduate
## Min. :0.000000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.000000 Median :0.0000 Median :1.0000 Median :1.0000
## Mean :0.003781 Mean :0.3554 Mean :0.6408 Mean :0.7958
## 3rd Qu.:0.000000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.000000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :1.0000 Median :0.0000
## Mean :0.2042 Mean :0.04726 Mean :0.8204 Mean :0.1323
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## Property_AreaRural Property_AreaSemiurban Property_AreaUrban Loan_StatusN
## Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.293 Mean :0.3951 Mean :0.3119 Mean :0.3081
## 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Loan_StatusY Dependents Dependents0 Dependents1
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.00000 Median :1.0000 Median :0.0000
## Mean :0.6919 Mean :0.02268 Mean :0.5577 Mean :0.1607
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## Dependents2 Dependents3+
## Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000
## Mean :0.1739 Mean :0.08507
## 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000
# Fit a caret pre-processor with the "range" method on the encoded training
# data, then apply it to that same data.
encoded_training_dataset_norm <- preProcess(
  encoded_training_dataset,
  method = "range"
)
normalized_training_dataset <- predict(
  encoded_training_dataset_norm,
  newdata = encoded_training_dataset
)
head(normalized_training_dataset)
## ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
## 2 0.05482993 0.04456660 0.17221418 0.7297297 1
## 3 0.03525046 0.00000000 0.08248915 0.7297297 1
## 4 0.03009276 0.06968703 0.16063676 0.7297297 1
## 5 0.07235622 0.00000000 0.19102750 0.7297297 1
## 6 0.06514533 0.12400627 0.37337192 0.7297297 1
## 7 0.02700062 0.04480303 0.12445731 0.7297297 1
## Gender GenderFemale GenderMale Married MarriedNo MarriedYes EducationGraduate
## 2 0 0 1 0 0 1 1
## 3 0 0 1 0 0 1 1
## 4 0 0 1 0 0 1 0
## 5 0 0 1 0 1 0 1
## 6 0 0 1 0 0 1 1
## 7 0 0 1 0 0 1 0
## EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## 2 0 0 1 0
## 3 0 0 0 1
## 4 1 0 1 0
## 5 0 0 1 0
## 6 0 0 0 1
## 7 1 0 1 0
## Property_AreaRural Property_AreaSemiurban Property_AreaUrban Loan_StatusN
## 2 1 0 0 1
## 3 0 0 1 0
## 4 0 0 1 0
## 5 0 0 1 0
## 6 0 0 1 0
## 7 0 0 1 0
## Loan_StatusY Dependents Dependents0 Dependents1 Dependents2 Dependents3+
## 2 0 0 0 1 0 0
## 3 1 0 1 0 0 0
## 4 1 0 1 0 0 0
## 5 1 0 1 0 0 0
## 6 1 0 0 0 1 0
## 7 1 0 1 0 0 0
# Summarise every column after rescaling to check the resulting ranges.
summary(object = normalized_training_dataset)
## ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.03401 1st Qu.:0.00000 1st Qu.:0.1317 1st Qu.:0.7297
## Median :0.04534 Median :0.03209 Median :0.1722 Median :0.7297
## Mean :0.06627 Mean :0.04558 Mean :0.1981 Mean :0.6900
## 3rd Qu.:0.07007 3rd Qu.:0.06596 3rd Qu.:0.2287 3rd Qu.:0.7297
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## Credit_History Gender GenderFemale GenderMale
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :1.0000 Median :0.00000 Median :0.0000 Median :1.0000
## Mean :0.8507 Mean :0.02268 Mean :0.1796 Mean :0.7977
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## Married MarriedNo MarriedYes EducationGraduate
## Min. :0.000000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.000000 Median :0.0000 Median :1.0000 Median :1.0000
## Mean :0.003781 Mean :0.3554 Mean :0.6408 Mean :0.7958
## 3rd Qu.:0.000000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.000000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :1.0000 Median :0.0000
## Mean :0.2042 Mean :0.04726 Mean :0.8204 Mean :0.1323
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## Property_AreaRural Property_AreaSemiurban Property_AreaUrban Loan_StatusN
## Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.293 Mean :0.3951 Mean :0.3119 Mean :0.3081
## 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Loan_StatusY Dependents Dependents0 Dependents1
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.00000 Median :1.0000 Median :0.0000
## Mean :0.6919 Mean :0.02268 Mean :0.5577 Mean :0.1607
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## Dependents2 Dependents3+
## Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000
## Mean :0.1739 Mean :0.08507
## 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000
# FEATURE SELECTION (VARIABLE SELECTION)
# 1. Corrplot:
library(corrplot)
## corrplot 0.92 loaded
# Pairwise correlations between the numeric columns, using only rows that are
# complete across all of them, drawn as a circle plot.
numeric_columns <- Filter(is.numeric, normalized_training_dataset)
cor_matrix <- cor(numeric_columns, use = "complete.obs")
corrplot(cor_matrix, method = "circle", tl.cex = 0.7)
# 2. PCA FOR ALL VARIABLES:
library(FactoMineR)
# The one-hot target columns must not help build the components, otherwise
# the outcome itself drives the feature analysis. Passing them as
# supplementary variables keeps them in the output (projected onto the axes)
# without letting them influence the decomposition.
target_columns <- which(
  names(normalized_training_dataset) %in% c("Loan_StatusN", "Loan_StatusY")
)
PCA(
  normalized_training_dataset,
  # An empty index vector must be passed as NULL, not integer(0).
  quanti.sup = if (length(target_columns) > 0) target_columns else NULL
)
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 529 individuals, described by 26 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
# Collapse the two target indicators into a single numeric column:
# 0 when Loan_StatusN is 1 (status "N"), 1 otherwise (status "Y").
normalized_training_dataset$Loan_Status <- ifelse(
  normalized_training_dataset$Loan_StatusN == 1, 0, 1
)
# Drop the original Loan Status columns:
# Selected by name rather than by position (previously columns 20 and 21) so
# the right columns are removed even if the encoding step changes the layout.
target_indicator_names <- c("Loan_StatusN", "Loan_StatusY")
normalized_training_dataset <- normalized_training_dataset[
  ,
  !(names(normalized_training_dataset) %in% target_indicator_names),
  drop = FALSE
]
# Checking names of columns to ensure changes have been made:
colnames(normalized_training_dataset)
## [1] "ApplicantIncome" "CoapplicantIncome" "LoanAmount"
## [4] "Loan_Amount_Term" "Credit_History" "Gender"
## [7] "GenderFemale" "GenderMale" "Married"
## [10] "MarriedNo" "MarriedYes" "EducationGraduate"
## [13] "EducationNot Graduate" "Self_Employed" "Self_EmployedNo"
## [16] "Self_EmployedYes" "Property_AreaRural" "Property_AreaSemiurban"
## [19] "Property_AreaUrban" "Dependents" "Dependents0"
## [22] "Dependents1" "Dependents2" "Dependents3+"
## [25] "Loan_Status"
# Preview the first six rows of the normalized data.
head(normalized_training_dataset, n = 6L)
## ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
## 2 0.05482993 0.04456660 0.17221418 0.7297297 1
## 3 0.03525046 0.00000000 0.08248915 0.7297297 1
## 4 0.03009276 0.06968703 0.16063676 0.7297297 1
## 5 0.07235622 0.00000000 0.19102750 0.7297297 1
## 6 0.06514533 0.12400627 0.37337192 0.7297297 1
## 7 0.02700062 0.04480303 0.12445731 0.7297297 1
## Gender GenderFemale GenderMale Married MarriedNo MarriedYes EducationGraduate
## 2 0 0 1 0 0 1 1
## 3 0 0 1 0 0 1 1
## 4 0 0 1 0 0 1 0
## 5 0 0 1 0 1 0 1
## 6 0 0 1 0 0 1 1
## 7 0 0 1 0 0 1 0
## EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## 2 0 0 1 0
## 3 0 0 0 1
## 4 1 0 1 0
## 5 0 0 1 0
## 6 0 0 0 1
## 7 1 0 1 0
## Property_AreaRural Property_AreaSemiurban Property_AreaUrban Dependents
## 2 1 0 0 0
## 3 0 0 1 0
## 4 0 0 1 0
## 5 0 0 1 0
## 6 0 0 1 0
## 7 0 0 1 0
## Dependents0 Dependents1 Dependents2 Dependents3+ Loan_Status
## 2 0 1 0 0 0
## 3 1 0 0 0 1
## 4 1 0 0 0 1
## 5 1 0 0 0 1
## 6 0 0 1 0 1
## 7 1 0 0 0 1
# 3. Stepwise Regression:
# Load necessary library
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Fit the initial model with all predictors.
# Loan_Status is a 0/1 outcome, so a logistic regression (binomial family) is
# used instead of ordinary least squares, which treats the response as
# continuous and can produce fitted values outside [0, 1].
initial_model <- glm(
  Loan_Status ~ .,
  data = normalized_training_dataset,
  family = binomial
)
# Perform backward elimination for variable selection: starting from the full
# model, step() repeatedly drops the term whose removal lowers AIC the most.
final_model <- step(initial_model, direction = "backward")
## Start: AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount +
## Loan_Amount_Term + Credit_History + Gender + GenderFemale +
## GenderMale + Married + MarriedNo + MarriedYes + EducationGraduate +
## `EducationNot Graduate` + Self_Employed + Self_EmployedNo +
## Self_EmployedYes + Property_AreaRural + Property_AreaSemiurban +
## Property_AreaUrban + Dependents + Dependents0 + Dependents1 +
## Dependents2 + `Dependents3+`
##
##
## Step: AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount +
## Loan_Amount_Term + Credit_History + Gender + GenderFemale +
## GenderMale + Married + MarriedNo + MarriedYes + EducationGraduate +
## `EducationNot Graduate` + Self_Employed + Self_EmployedNo +
## Self_EmployedYes + Property_AreaRural + Property_AreaSemiurban +
## Property_AreaUrban + Dependents + Dependents0 + Dependents1 +
## Dependents2
##
##
## Step: AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount +
## Loan_Amount_Term + Credit_History + Gender + GenderFemale +
## GenderMale + Married + MarriedNo + MarriedYes + EducationGraduate +
## `EducationNot Graduate` + Self_Employed + Self_EmployedNo +
## Self_EmployedYes + Property_AreaRural + Property_AreaSemiurban +
## Dependents + Dependents0 + Dependents1 + Dependents2
##
##
## Step: AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount +
## Loan_Amount_Term + Credit_History + Gender + GenderFemale +
## GenderMale + Married + MarriedNo + MarriedYes + EducationGraduate +
## `EducationNot Graduate` + Self_Employed + Self_EmployedNo +
## Property_AreaRural + Property_AreaSemiurban + Dependents +
## Dependents0 + Dependents1 + Dependents2
##
##
## Step: AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount +
## Loan_Amount_Term + Credit_History + Gender + GenderFemale +
## GenderMale + Married + MarriedNo + MarriedYes + EducationGraduate +
## Self_Employed + Self_EmployedNo + Property_AreaRural + Property_AreaSemiurban +
## Dependents + Dependents0 + Dependents1 + Dependents2
##
##
## Step: AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount +
## Loan_Amount_Term + Credit_History + Gender + GenderFemale +
## GenderMale + Married + MarriedNo + EducationGraduate + Self_Employed +
## Self_EmployedNo + Property_AreaRural + Property_AreaSemiurban +
## Dependents + Dependents0 + Dependents1 + Dependents2
##
##
## Step: AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount +
## Loan_Amount_Term + Credit_History + Gender + GenderFemale +
## Married + MarriedNo + EducationGraduate + Self_Employed +
## Self_EmployedNo + Property_AreaRural + Property_AreaSemiurban +
## Dependents + Dependents0 + Dependents1 + Dependents2
##
## Df Sum of Sq RSS AIC
## - Dependents2 1 0.0036 74.323 -1002.20
## - Self_EmployedNo 1 0.0201 74.339 -1002.08
## - ApplicantIncome 1 0.0302 74.349 -1002.01
## - Gender 1 0.0404 74.359 -1001.94
## - Dependents 1 0.0408 74.360 -1001.94
## - Married 1 0.0502 74.369 -1001.87
## - Dependents0 1 0.0834 74.402 -1001.63
## - Loan_Amount_Term 1 0.1048 74.424 -1001.48
## - Property_AreaRural 1 0.1360 74.455 -1001.26
## - GenderFemale 1 0.1408 74.460 -1001.23
## - CoapplicantIncome 1 0.1746 74.494 -1000.98
## - LoanAmount 1 0.2009 74.520 -1000.80
## - Dependents1 1 0.2193 74.538 -1000.67
## - Self_Employed 1 0.2261 74.545 -1000.62
## <none> 74.319 -1000.23
## - EducationGraduate 1 0.3005 74.620 -1000.09
## - MarriedNo 1 0.5270 74.846 -998.49
## - Property_AreaSemiurban 1 1.0786 75.398 -994.60
## - Credit_History 1 31.1879 105.507 -816.86
##
## Step: AIC=-1002.2
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount +
## Loan_Amount_Term + Credit_History + Gender + GenderFemale +
## Married + MarriedNo + EducationGraduate + Self_Employed +
## Self_EmployedNo + Property_AreaRural + Property_AreaSemiurban +
## Dependents + Dependents0 + Dependents1
##
## Df Sum of Sq RSS AIC
## - Self_EmployedNo 1 0.0209 74.344 -1004.05
## - ApplicantIncome 1 0.0328 74.356 -1003.97
## - Dependents 1 0.0374 74.360 -1003.93
## - Gender 1 0.0393 74.362 -1003.92
## - Married 1 0.0504 74.373 -1003.84
## - Loan_Amount_Term 1 0.1071 74.430 -1003.44
## - Property_AreaRural 1 0.1333 74.456 -1003.25
## - Dependents0 1 0.1341 74.457 -1003.25
## - GenderFemale 1 0.1419 74.465 -1003.19
## - CoapplicantIncome 1 0.1763 74.499 -1002.95
## - LoanAmount 1 0.1992 74.522 -1002.78
## - Self_Employed 1 0.2253 74.548 -1002.60
## <none> 74.323 -1002.20
## - EducationGraduate 1 0.2995 74.622 -1002.07
## - Dependents1 1 0.3320 74.655 -1001.84
## - MarriedNo 1 0.5249 74.848 -1000.48
## - Property_AreaSemiurban 1 1.0882 75.411 -996.51
## - Credit_History 1 31.1959 105.519 -818.80
##
## Step: AIC=-1004.05
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount +
## Loan_Amount_Term + Credit_History + Gender + GenderFemale +
## Married + MarriedNo + EducationGraduate + Self_Employed +
## Property_AreaRural + Property_AreaSemiurban + Dependents +
## Dependents0 + Dependents1
##
## Df Sum of Sq RSS AIC
## - ApplicantIncome 1 0.0288 74.372 -1005.85
## - Dependents 1 0.0367 74.380 -1005.79
## - Gender 1 0.0414 74.385 -1005.76
## - Married 1 0.0512 74.395 -1005.69
## - Loan_Amount_Term 1 0.1044 74.448 -1005.31
## - Dependents0 1 0.1307 74.474 -1005.12
## - Property_AreaRural 1 0.1389 74.483 -1005.06
## - GenderFemale 1 0.1435 74.487 -1005.03
## - CoapplicantIncome 1 0.1782 74.522 -1004.78
## - LoanAmount 1 0.2035 74.547 -1004.61
## - Self_Employed 1 0.2128 74.556 -1004.54
## <none> 74.344 -1004.05
## - EducationGraduate 1 0.3020 74.646 -1003.91
## - Dependents1 1 0.3422 74.686 -1003.62
## - MarriedNo 1 0.5287 74.872 -1002.30
## - Property_AreaSemiurban 1 1.0850 75.429 -998.39
## - Credit_History 1 31.1889 105.533 -820.73
##
## Step: AIC=-1005.85
## Loan_Status ~ CoapplicantIncome + LoanAmount + Loan_Amount_Term +
## Credit_History + Gender + GenderFemale + Married + MarriedNo +
## EducationGraduate + Self_Employed + Property_AreaRural +
## Property_AreaSemiurban + Dependents + Dependents0 + Dependents1
##
## Df Sum of Sq RSS AIC
## - Gender 1 0.0371 74.410 -1007.58
## - Dependents 1 0.0384 74.411 -1007.57
## - Married 1 0.0504 74.423 -1007.49
## - Loan_Amount_Term 1 0.1162 74.489 -1007.02
## - Dependents0 1 0.1314 74.504 -1006.91
## - Property_AreaRural 1 0.1446 74.517 -1006.82
## - GenderFemale 1 0.1478 74.520 -1006.80
## - LoanAmount 1 0.1881 74.561 -1006.51
## - Self_Employed 1 0.2195 74.592 -1006.29
## - CoapplicantIncome 1 0.2341 74.607 -1006.18
## <none> 74.372 -1005.85
## - EducationGraduate 1 0.3143 74.687 -1005.62
## - Dependents1 1 0.3437 74.716 -1005.41
## - MarriedNo 1 0.5185 74.891 -1004.17
## - Property_AreaSemiurban 1 1.0728 75.445 -1000.27
## - Credit_History 1 31.1615 105.534 -822.72
##
## Step: AIC=-1007.58
## Loan_Status ~ CoapplicantIncome + LoanAmount + Loan_Amount_Term +
## Credit_History + GenderFemale + Married + MarriedNo + EducationGraduate +
## Self_Employed + Property_AreaRural + Property_AreaSemiurban +
## Dependents + Dependents0 + Dependents1
##
## Df Sum of Sq RSS AIC
## - Dependents 1 0.0363 74.446 -1009.32
## - Married 1 0.0506 74.460 -1009.22
## - Loan_Amount_Term 1 0.1149 74.525 -1008.77
## - Dependents0 1 0.1294 74.539 -1008.66
## - GenderFemale 1 0.1394 74.549 -1008.59
## - Property_AreaRural 1 0.1460 74.556 -1008.55
## - LoanAmount 1 0.2095 74.619 -1008.10
## - Self_Employed 1 0.2248 74.634 -1007.99
## - CoapplicantIncome 1 0.2254 74.635 -1007.98
## <none> 74.410 -1007.58
## - EducationGraduate 1 0.3071 74.717 -1007.40
## - Dependents1 1 0.3347 74.744 -1007.21
## - MarriedNo 1 0.5198 74.929 -1005.90
## - Property_AreaSemiurban 1 1.0663 75.476 -1002.06
## - Credit_History 1 31.3212 105.731 -823.74
##
## Step: AIC=-1009.32
## Loan_Status ~ CoapplicantIncome + LoanAmount + Loan_Amount_Term +
## Credit_History + GenderFemale + Married + MarriedNo + EducationGraduate +
## Self_Employed + Property_AreaRural + Property_AreaSemiurban +
## Dependents0 + Dependents1
##
## Df Sum of Sq RSS AIC
## - Married 1 0.026 74.472 -1011.14
## - Dependents0 1 0.106 74.551 -1010.57
## - Loan_Amount_Term 1 0.120 74.566 -1010.47
## - Property_AreaRural 1 0.139 74.585 -1010.34
## - GenderFemale 1 0.143 74.589 -1010.31
## - LoanAmount 1 0.204 74.650 -1009.88
## - CoapplicantIncome 1 0.224 74.670 -1009.73
## - Self_Employed 1 0.229 74.675 -1009.70
## <none> 74.446 -1009.32
## - Dependents1 1 0.308 74.754 -1009.14
## - EducationGraduate 1 0.319 74.765 -1009.06
## - MarriedNo 1 0.542 74.988 -1007.49
## - Property_AreaSemiurban 1 1.086 75.531 -1003.67
## - Credit_History 1 32.034 106.480 -822.01
##
## Step: AIC=-1011.14
## Loan_Status ~ CoapplicantIncome + LoanAmount + Loan_Amount_Term +
## Credit_History + GenderFemale + MarriedNo + EducationGraduate +
## Self_Employed + Property_AreaRural + Property_AreaSemiurban +
## Dependents0 + Dependents1
##
## Df Sum of Sq RSS AIC
## - Loan_Amount_Term 1 0.112 74.584 -1012.34
## - Dependents0 1 0.115 74.587 -1012.32
## - Property_AreaRural 1 0.144 74.616 -1012.11
## - GenderFemale 1 0.145 74.617 -1012.11
## - LoanAmount 1 0.204 74.677 -1011.69
## - Self_Employed 1 0.228 74.700 -1011.52
## - CoapplicantIncome 1 0.230 74.702 -1011.51
## <none> 74.472 -1011.14
## - Dependents1 1 0.321 74.793 -1010.86
## - EducationGraduate 1 0.324 74.797 -1010.84
## - MarriedNo 1 0.546 75.019 -1009.27
## - Property_AreaSemiurban 1 1.082 75.554 -1005.51
## - Credit_History 1 32.111 106.583 -823.49
##
## Step: AIC=-1012.34
## Loan_Status ~ CoapplicantIncome + LoanAmount + Credit_History +
## GenderFemale + MarriedNo + EducationGraduate + Self_Employed +
## Property_AreaRural + Property_AreaSemiurban + Dependents0 +
## Dependents1
##
## Df Sum of Sq RSS AIC
## - Dependents0 1 0.119 74.703 -1013.50
## - GenderFemale 1 0.160 74.744 -1013.21
## - Property_AreaRural 1 0.161 74.745 -1013.20
## - LoanAmount 1 0.214 74.798 -1012.83
## - CoapplicantIncome 1 0.231 74.815 -1012.71
## - Self_Employed 1 0.235 74.819 -1012.68
## <none> 74.584 -1012.34
## - Dependents1 1 0.292 74.876 -1012.28
## - EducationGraduate 1 0.298 74.882 -1012.23
## - MarriedNo 1 0.577 75.161 -1010.27
## - Property_AreaSemiurban 1 1.040 75.624 -1007.02
## - Credit_History 1 32.088 106.672 -825.05
##
## Step: AIC=-1013.5
## Loan_Status ~ CoapplicantIncome + LoanAmount + Credit_History +
## GenderFemale + MarriedNo + EducationGraduate + Self_Employed +
## Property_AreaRural + Property_AreaSemiurban + Dependents1
##
## Df Sum of Sq RSS AIC
## - Property_AreaRural 1 0.177 74.880 -1014.25
## - GenderFemale 1 0.181 74.884 -1014.22
## - LoanAmount 1 0.182 74.885 -1014.21
## - Dependents1 1 0.183 74.885 -1014.21
## - Self_Employed 1 0.238 74.940 -1013.82
## - CoapplicantIncome 1 0.257 74.959 -1013.69
## <none> 74.703 -1013.50
## - EducationGraduate 1 0.284 74.986 -1013.50
## - MarriedNo 1 0.840 75.543 -1009.59
## - Property_AreaSemiurban 1 1.008 75.710 -1008.41
## - Credit_History 1 31.969 106.672 -827.05
##
## Step: AIC=-1014.25
## Loan_Status ~ CoapplicantIncome + LoanAmount + Credit_History +
## GenderFemale + MarriedNo + EducationGraduate + Self_Employed +
## Property_AreaSemiurban + Dependents1
##
## Df Sum of Sq RSS AIC
## - Dependents1 1 0.145 75.025 -1015.23
## - GenderFemale 1 0.178 75.058 -1014.99
## - Self_Employed 1 0.218 75.098 -1014.71
## - LoanAmount 1 0.222 75.102 -1014.69
## - CoapplicantIncome 1 0.256 75.136 -1014.45
## <none> 74.880 -1014.25
## - EducationGraduate 1 0.330 75.210 -1013.93
## - MarriedNo 1 0.856 75.736 -1010.24
## - Property_AreaSemiurban 1 2.047 76.927 -1001.99
## - Credit_History 1 31.930 106.810 -828.37
##
## Step: AIC=-1015.23
## Loan_Status ~ CoapplicantIncome + LoanAmount + Credit_History +
## GenderFemale + MarriedNo + EducationGraduate + Self_Employed +
## Property_AreaSemiurban
##
## Df Sum of Sq RSS AIC
## - GenderFemale 1 0.204 75.228 -1015.79
## - Self_Employed 1 0.217 75.242 -1015.70
## - LoanAmount 1 0.242 75.267 -1015.52
## - CoapplicantIncome 1 0.243 75.268 -1015.52
## <none> 75.025 -1015.23
## - EducationGraduate 1 0.327 75.351 -1014.93
## - MarriedNo 1 0.775 75.799 -1011.79
## - Property_AreaSemiurban 1 2.053 77.078 -1002.95
## - Credit_History 1 31.892 106.917 -829.84
##
## Step: AIC=-1015.79
## Loan_Status ~ CoapplicantIncome + LoanAmount + Credit_History +
## MarriedNo + EducationGraduate + Self_Employed + Property_AreaSemiurban
##
## Df Sum of Sq RSS AIC
## - CoapplicantIncome 1 0.198 75.426 -1016.41
## - Self_Employed 1 0.200 75.428 -1016.39
## - LoanAmount 1 0.224 75.452 -1016.22
## <none> 75.228 -1015.79
## - EducationGraduate 1 0.295 75.524 -1015.72
## - MarriedNo 1 1.191 76.419 -1009.49
## - Property_AreaSemiurban 1 1.927 77.155 -1004.42
## - Credit_History 1 32.011 107.240 -830.24
##
## Step: AIC=-1016.41
## Loan_Status ~ LoanAmount + Credit_History + MarriedNo + EducationGraduate +
## Self_Employed + Property_AreaSemiurban
##
## Df Sum of Sq RSS AIC
## - Self_Employed 1 0.208 75.634 -1017.0
## - EducationGraduate 1 0.277 75.703 -1016.5
## <none> 75.426 -1016.4
## - LoanAmount 1 0.289 75.715 -1016.4
## - MarriedNo 1 1.113 76.539 -1010.7
## - Property_AreaSemiurban 1 1.944 77.370 -1004.9
## - Credit_History 1 32.066 107.492 -831.0
##
## Step: AIC=-1016.95
## Loan_Status ~ LoanAmount + Credit_History + MarriedNo + EducationGraduate +
## Property_AreaSemiurban
##
## Df Sum of Sq RSS AIC
## - EducationGraduate 1 0.265 75.900 -1017.09
## <none> 75.634 -1016.95
## - LoanAmount 1 0.287 75.921 -1016.94
## - MarriedNo 1 1.061 76.695 -1011.58
## - Property_AreaSemiurban 1 1.924 77.558 -1005.66
## - Credit_History 1 32.598 108.232 -829.37
##
## Step: AIC=-1017.09
## Loan_Status ~ LoanAmount + Credit_History + MarriedNo + Property_AreaSemiurban
##
## Df Sum of Sq RSS AIC
## - LoanAmount 1 0.204 76.104 -1017.67
## <none> 75.900 -1017.09
## - MarriedNo 1 1.015 76.915 -1012.06
## - Property_AreaSemiurban 1 1.976 77.875 -1005.50
## - Credit_History 1 33.153 109.053 -827.37
##
## Step: AIC=-1017.67
## Loan_Status ~ Credit_History + MarriedNo + Property_AreaSemiurban
##
## Df Sum of Sq RSS AIC
## <none> 76.104 -1017.67
## - MarriedNo 1 0.890 76.994 -1013.52
## - Property_AreaSemiurban 1 1.978 78.082 -1006.10
## - Credit_History 1 33.272 109.376 -827.81
# Print the final model
summary(final_model)
##
## Call:
## lm(formula = Loan_Status ~ Credit_History + MarriedNo + Property_AreaSemiurban,
## data = normalized_training_dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.90313 -0.07414 0.09687 0.22199 1.01157
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.07414 0.04638 1.598 0.110547
## Credit_History 0.70387 0.04646 15.150 < 2e-16 ***
## MarriedNo -0.08571 0.03459 -2.478 0.013526 *
## Property_AreaSemiurban 0.12512 0.03387 3.694 0.000244 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3807 on 525 degrees of freedom
## Multiple R-squared: 0.3252, Adjusted R-squared: 0.3213
## F-statistic: 84.32 on 3 and 525 DF, p-value: < 2.2e-16
# DROPPING UNWANTED VARIABLES [COLUMNS]
library(class)
head(normalized_training_dataset)
## ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
## 2 0.05482993 0.04456660 0.17221418 0.7297297 1
## 3 0.03525046 0.00000000 0.08248915 0.7297297 1
## 4 0.03009276 0.06968703 0.16063676 0.7297297 1
## 5 0.07235622 0.00000000 0.19102750 0.7297297 1
## 6 0.06514533 0.12400627 0.37337192 0.7297297 1
## 7 0.02700062 0.04480303 0.12445731 0.7297297 1
## Gender GenderFemale GenderMale Married MarriedNo MarriedYes EducationGraduate
## 2 0 0 1 0 0 1 1
## 3 0 0 1 0 0 1 1
## 4 0 0 1 0 0 1 0
## 5 0 0 1 0 1 0 1
## 6 0 0 1 0 0 1 1
## 7 0 0 1 0 0 1 0
## EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## 2 0 0 1 0
## 3 0 0 0 1
## 4 1 0 1 0
## 5 0 0 1 0
## 6 0 0 0 1
## 7 1 0 1 0
## Property_AreaRural Property_AreaSemiurban Property_AreaUrban Dependents
## 2 1 0 0 0
## 3 0 0 1 0
## 4 0 0 1 0
## 5 0 0 1 0
## 6 0 0 1 0
## 7 0 0 1 0
## Dependents0 Dependents1 Dependents2 Dependents3+ Loan_Status
## 2 0 1 0 0 0
## 3 1 0 0 0 1
## 4 1 0 0 0 1
## 5 1 0 0 0 1
## 6 0 0 1 0 1
## 7 1 0 0 0 1
# Keep only the columns used by the classification models. Columns are chosen
# by name rather than by negative position, so the step stays correct if the
# column order of normalized_training_dataset ever changes.
# NOTE(review): the stepwise selection printed above ends with Credit_History,
# MarriedNo and Property_AreaSemiurban, whereas this subset keeps
# ApplicantIncome plus the Education and Self_Employed indicators and drops
# Property_AreaSemiurban -- confirm this is the intended feature set.
normalized_class_training_dataset <- normalized_training_dataset[, c(
  "ApplicantIncome", "Credit_History", "MarriedNo", "MarriedYes",
  "EducationGraduate", "EducationNot Graduate",
  "Self_EmployedNo", "Self_EmployedYes", "Loan_Status"
)]
head(normalized_class_training_dataset)
## ApplicantIncome Credit_History MarriedNo MarriedYes EducationGraduate
## 2 0.05482993 1 0 1 1
## 3 0.03525046 1 0 1 1
## 4 0.03009276 1 0 1 0
## 5 0.07235622 1 1 0 1
## 6 0.06514533 1 0 1 1
## 7 0.02700062 1 0 1 0
## EducationNot Graduate Self_EmployedNo Self_EmployedYes Loan_Status
## 2 0 1 0 0
## 3 0 0 1 1
## 4 1 1 0 1
## 5 0 1 0 1
## 6 0 0 1 1
## 7 1 1 0 1
# SELECTED VARIABLE PLOTS:
# 1. PCA FOR SELECTED VARIABLES:
head(normalized_class_training_dataset)
## ApplicantIncome Credit_History MarriedNo MarriedYes EducationGraduate
## 2 0.05482993 1 0 1 1
## 3 0.03525046 1 0 1 1
## 4 0.03009276 1 0 1 0
## 5 0.07235622 1 1 0 1
## 6 0.06514533 1 0 1 1
## 7 0.02700062 1 0 1 0
## EducationNot Graduate Self_EmployedNo Self_EmployedYes Loan_Status
## 2 0 1 0 0
## 3 0 0 1 1
## 4 1 1 0 1
## 5 0 1 0 1
## 6 0 0 1 1
## 7 1 1 0 1
library(FactoMineR)
# Run a principal component analysis on the selected columns and print the
# summary of the available result objects.
# NOTE(review): the data frame passed here still contains Loan_Status, so the
# target is one of the 9 variables analysed -- confirm whether it should be
# excluded (or treated as a supplementary variable) instead.
PCA(normalized_class_training_dataset)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 529 individuals, described by 9 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
# 2. Pair Matrix for selected variables:
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Scatterplot matrix of the selected columns, with points coloured by loan
# status. Loan_Status is coded 0/1, and indexing a vector with 0 silently
# drops that element; the codes are therefore mapped to 1-based positions via
# factor() so that the colour vector has one entry per row (red = first level,
# yellow = second level) instead of being shortened and recycled.
pairs.panels(
  normalized_class_training_dataset[1:9],
  gap = 0,
  bg = c("red", "yellow", "blue")[
    as.integer(factor(normalized_class_training_dataset$Loan_Status))
  ],
  pch = 21
)
library(caret)
# Determining optimum 'k' value:
# 1. Tuning 'k':
colnames(normalized_class_training_dataset)
## [1] "ApplicantIncome" "Credit_History" "MarriedNo"
## [4] "MarriedYes" "EducationGraduate" "EducationNot Graduate"
## [7] "Self_EmployedNo" "Self_EmployedYes" "Loan_Status"
# Tune k for k-NN with caret; no resampling or tuning-grid options are passed,
# so caret's defaults apply.
# NOTE(review): Loan_Status is numeric here, so caret fits a regression (see
# the warning printed below). Converting it to a 2-level factor would tune k
# for classification instead, but would change the recorded results.
model <- train(
  Loan_Status ~ `ApplicantIncome` + `Credit_History` + `MarriedNo` +
    `MarriedYes` + `EducationGraduate` + `EducationNot Graduate` +
    `Self_EmployedNo` + `Self_EmployedYes`,
  data = normalized_class_training_dataset,
  method = "knn"
)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
model
## k-Nearest Neighbors
##
## 529 samples
## 8 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 529, 529, 529, 529, 529, 529, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.4471036 0.1726820 0.2947962
## 7 0.4356280 0.1824096 0.3004809
## 9 0.4300371 0.1879652 0.3049117
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.
# Testing:
colnames(Testing_dataset)
## [1] "Loan_ID" "Gender" "Married"
## [4] "Dependents" "Education" "Self_Employed"
## [7] "ApplicantIncome" "CoapplicantIncome" "LoanAmount"
## [10] "Loan_Amount_Term" "Credit_History" "Property_Area"
# Keep only the raw test columns that feed the selected model features.
# Selecting by name (instead of a negative positional index) makes the step
# independent of column order and safe to re-run: the positional version would
# strip further columns if executed a second time on the reduced data frame.
Testing_dataset <- Testing_dataset[, c(
  "Married", "Education", "Self_Employed", "ApplicantIncome", "Credit_History"
)]
head(Testing_dataset)
## Married Education Self_Employed ApplicantIncome Credit_History
## 1 Yes Graduate No 5720 1
## 2 Yes Graduate No 3076 1
## 3 Yes Graduate No 5000 1
## 4 Yes Graduate No 2340 NA
## 5 No Not Graduate No 3276 1
## 6 Yes Not Graduate Yes 2165 1
# CONVERTING CATEGORICAL VARIABLES TO NUMERIC BY ONE-HOT ENCODING IN TESTING DATASET:
library(caret)
head(Testing_dataset)
## Married Education Self_Employed ApplicantIncome Credit_History
## 1 Yes Graduate No 5720 1
## 2 Yes Graduate No 3076 1
## 3 Yes Graduate No 5000 1
## 4 Yes Graduate No 2340 NA
## 5 No Not Graduate No 3276 1
## 6 Yes Not Graduate Yes 2165 1
# Build one caret dummy-variable encoder per categorical column of the test set.
dummy_Married_Test <- dummyVars(~Married, data=Testing_dataset)
dummy_Education_Test <- dummyVars(~Education, data=Testing_dataset)
dummy_Self_Employed_Test <- dummyVars(~Self_Employed, data=Testing_dataset)
# Append the generated indicator columns to the test data. The raw categorical
# columns are still present after this step and are removed separately.
# NOTE(review): the result contains an indicator column named just
# "Self_Employed" next to Self_EmployedNo / Self_EmployedYes (see the head()
# output below); presumably it flags blank entries -- confirm.
encoded_Testing_dataset <- cbind(Testing_dataset,
predict(dummy_Married_Test,Testing_dataset),
predict(dummy_Education_Test,Testing_dataset),
predict(dummy_Self_Employed_Test,Testing_dataset))
head(encoded_Testing_dataset)
## Married Education Self_Employed ApplicantIncome Credit_History MarriedNo
## 1 Yes Graduate No 5720 1 0
## 2 Yes Graduate No 3076 1 0
## 3 Yes Graduate No 5000 1 0
## 4 Yes Graduate No 2340 NA 0
## 5 No Not Graduate No 3276 1 1
## 6 Yes Not Graduate Yes 2165 1 0
## MarriedYes EducationGraduate EducationNot Graduate Self_Employed
## 1 1 1 0 0
## 2 1 1 0 0
## 3 1 1 0 0
## 4 1 1 0 0
## 5 0 0 1 0
## 6 1 0 1 0
## Self_EmployedNo Self_EmployedYes
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 0 1
# REMOVE ENCODED COLUMNS:
# Drops the three raw categorical columns (Married, Education, Self_Employed),
# which sit in positions 1-3 of the combined data frame.
# NOTE(review): the data frame holds two columns named "Self_Employed" at this
# point (the raw one and an indicator), so removing by name would be
# ambiguous; keep the positional index unless the duplicate name is resolved.
encoded_Testing_dataset <- encoded_Testing_dataset[, -c(1,2,3)]
head(encoded_Testing_dataset)
## ApplicantIncome Credit_History MarriedNo MarriedYes EducationGraduate
## 1 5720 1 0 1 1
## 2 3076 1 0 1 1
## 3 5000 1 0 1 1
## 4 2340 NA 0 1 1
## 5 3276 1 1 0 0
## 6 2165 1 0 1 0
## EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## 1 0 0 1 0
## 2 0 0 1 0
## 3 0 0 1 0
## 4 0 0 1 0
## 5 1 0 1 0
## 6 1 0 0 1
# NORMALIZE THE TESTING SET:
summary(encoded_Testing_dataset)
## ApplicantIncome Credit_History MarriedNo MarriedYes
## Min. : 0 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 2864 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 3786 Median :1.0000 Median :0.0000 Median :1.0000
## Mean : 4806 Mean :0.8254 Mean :0.3651 Mean :0.6349
## 3rd Qu.: 5060 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :72529 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :29
## EducationGraduate EducationNot Graduate Self_Employed Self_EmployedNo
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.0000
## Median :1.0000 Median :0.0000 Median :0.00000 Median :1.0000
## Mean :0.7711 Mean :0.2289 Mean :0.06267 Mean :0.8365
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
##
## Self_EmployedYes
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1008
## 3rd Qu.:0.0000
## Max. :1.0000
##
# Fit caret's "range" pre-processing on the encoded test set and apply it, so
# that every column is rescaled to the 0-1 interval.
# NOTE(review): the scaling parameters are estimated from the test set itself
# rather than reused from the training data, so ApplicantIncome may not be on
# the same scale as in the training set -- confirm this is acceptable for a
# distance-based method such as k-NN.
encoded_Testing_dataset_norm <- preProcess(encoded_Testing_dataset,method = c('range'))
normalized_Testing_dataset <- predict(encoded_Testing_dataset_norm,encoded_Testing_dataset)
head(normalized_Testing_dataset)
## ApplicantIncome Credit_History MarriedNo MarriedYes EducationGraduate
## 1 0.07886501 1 0 1 1
## 2 0.04241062 1 0 1 1
## 3 0.06893794 1 0 1 1
## 4 0.03226296 NA 0 1 1
## 5 0.04516814 1 1 0 0
## 6 0.02985013 1 0 1 0
## EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## 1 0 0 1 0
## 2 0 0 1 0
## 3 0 0 1 0
## 4 0 0 1 0
## 5 1 0 1 0
## 6 1 0 0 1
summary(normalized_Testing_dataset)
## ApplicantIncome Credit_History MarriedNo MarriedYes
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.03949 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.05220 Median :1.0000 Median :0.0000 Median :1.0000
## Mean :0.06626 Mean :0.8254 Mean :0.3651 Mean :0.6349
## 3rd Qu.:0.06977 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :29
## EducationGraduate EducationNot Graduate Self_Employed Self_EmployedNo
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.0000
## Median :1.0000 Median :0.0000 Median :0.00000 Median :1.0000
## Mean :0.7711 Mean :0.2289 Mean :0.06267 Mean :0.8365
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
##
## Self_EmployedYes
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1008
## 3rd Qu.:0.0000
## Max. :1.0000
##
# Set CRAN mirror
options(repos = c(CRAN = "https://cran.rstudio.com"))
# Install necessary packages
# NOTE(review): the install.packages() calls in this section run on every
# execution, although class and caret were already attached earlier in this
# script -- consider installing once, outside the analysis.
install.packages("readxl", dependencies = TRUE)
##
## The downloaded binary packages are in
## /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
install.packages("class", dependencies = TRUE)
##
## The downloaded binary packages are in
## /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
install.packages("e1071", dependencies = TRUE)
##
## The downloaded binary packages are in
## /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
install.packages("caret", dependencies = TRUE)
##
## The downloaded binary packages are in
## /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
install.packages("ggplot2", dependencies = TRUE)
##
## The downloaded binary packages are in
## /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
install.packages("reshape2", dependencies = TRUE)
##
## The downloaded binary packages are in
## /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
install.packages("pROC", dependencies = TRUE)
##
## The downloaded binary packages are in
## /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
# Load necessary libraries
library(readxl)
library(class)
library(e1071)
library(caret)
library(ggplot2)
library(reshape2)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Working copies of the prepared training and test data.
train_set <- normalized_class_training_dataset
test_set <- normalized_Testing_dataset
# Drop the extra "Self_Employed" indicator, which has no counterpart among the
# training columns. It is removed by name rather than by position 7 so the
# step does not depend on the column order of the test data.
test_set <- test_set[, names(test_set) != "Self_Employed", drop = FALSE]
head(train_set)
## ApplicantIncome Credit_History MarriedNo MarriedYes EducationGraduate
## 2 0.05482993 1 0 1 1
## 3 0.03525046 1 0 1 1
## 4 0.03009276 1 0 1 0
## 5 0.07235622 1 1 0 1
## 6 0.06514533 1 0 1 1
## 7 0.02700062 1 0 1 0
## EducationNot Graduate Self_EmployedNo Self_EmployedYes Loan_Status
## 2 0 1 0 0
## 3 0 0 1 1
## 4 1 1 0 1
## 5 0 1 0 1
## 6 0 0 1 1
## 7 1 1 0 1
head(test_set)
## ApplicantIncome Credit_History MarriedNo MarriedYes EducationGraduate
## 1 0.07886501 1 0 1 1
## 2 0.04241062 1 0 1 1
## 3 0.06893794 1 0 1 1
## 4 0.03226296 NA 0 1 1
## 5 0.04516814 1 1 0 0
## 6 0.02985013 1 0 1 0
## EducationNot Graduate Self_EmployedNo Self_EmployedYes
## 1 0 1 0
## 2 0 1 0
## 3 0 1 0
## 4 0 1 0
## 5 1 1 0
## 6 1 0 1
# Drop test rows that contain missing values; knn() rejects inputs with NAs.
test_set <- na.omit(test_set)
# Separate features and target variable in the training data.
# setdiff() is used instead of -which(...): if the column name were ever
# absent, -which() would produce an empty index and silently select no columns.
train_features <- train_set[
  , setdiff(names(train_set), "Loan_Status"), drop = FALSE
]
train_target <- train_set$Loan_Status
# Features in the testing data (the test set has no Loan_Status column).
test_features <- test_set
# Predictions for the unlabeled testing set when k=5 (performance is measured later on a validation split)
knn_predictions_k5 <- knn(train = train_features, test = test_features, cl = train_target, k = 5)
knn_predictions_k5
## [1] 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1
## [75] 1 0 0 0 1 0 1 1 0 1 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0
## [112] 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1
## [149] 0 1 0 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 1 0 1
## [186] 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 0 0 0 1
## [223] 1 1 0 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 1 1 1 0
## [260] 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 0 1 1 1 1 1 0 1
## [297] 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 0 1 0 1 1
## [334] 1 0 1 1 1
## Levels: 0 1
# Predictions for the unlabeled testing set when k=7 (performance is measured later on a validation split)
knn_predictions_k7 <- knn(train = train_features, test = test_features, cl = train_target, k = 7)
knn_predictions_k7
## [1] 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1
## [75] 1 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0
## [112] 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1
## [149] 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 1 0 1 0 1
## [186] 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1
## [223] 1 1 0 1 0 1 0 1 1 1 1 0 0 1 0 1 0 0 1 1 1 1 1 1 0 1 0 1 1 1 0 0 0 1 1 1 0
## [260] 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1
## [297] 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1
## [334] 1 1 1 1 1
## Levels: 0 1
# Predictions for the unlabeled testing set when k=9 (performance is measured later on a validation split)
knn_predictions_k9 <- knn(train = train_features, test = test_features, cl = train_target, k = 9)
knn_predictions_k9
## [1] 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1
## [75] 1 0 1 0 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0
## [112] 1 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 0 1 1 0 1 1 1 1 1
## [149] 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 1 0 1 0 1
## [186] 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1
## [223] 1 1 0 1 0 1 0 1 1 1 1 0 0 1 0 1 0 0 1 1 1 1 1 1 0 1 0 1 1 1 0 0 0 1 1 1 0
## [260] 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1
## [297] 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1
## [334] 1 1 1 1 1
## Levels: 0 1
# The testing dataset has no target variable, so hold out 20% of the training
# data as a validation set. The seed makes the partition reproducible.
# NOTE(review): train_target is numeric here; for numeric outcomes
# createDataPartition() groups by percentiles rather than by class label, so
# the split may not be class-stratified -- confirm whether a factor was meant.
set.seed(123)
trainIndex <- createDataPartition(
  train_target,
  p = .8,
  list = FALSE,
  times = 1
)
train_set_train <- train_set[trainIndex, ]
train_set_val <- train_set[-trainIndex, ]
# Split the validation rows into predictors and the Loan_Status target.
val_target <- train_set_val$Loan_Status
val_features <- train_set_val[, -which(names(train_set_val) == "Loan_Status")]
#' Fit k-NN on the training split and score it on the validation split.
#'
#' The data arguments default to the global objects the script already
#' defines, so `evaluate_knn(k)` behaves as before; passing them explicitly
#' lets the function be reused on other splits.
#'
#' @param k Number of neighbours passed to knn().
#' @param train_data Data frame holding the predictors and a `Loan_Status`
#'   column. Defaults to the global `train_set_train`.
#' @param val_x Validation predictors. Defaults to the global `val_features`.
#' @param val_y Validation labels. Defaults to the global `val_target`.
#' @return The object returned by confusionMatrix() for the validation
#'   predictions against `val_y`.
evaluate_knn <- function(k,
                         train_data = train_set_train,
                         val_x = val_features,
                         val_y = val_target) {
  # Choose predictors by name; -which(...) would select zero columns if the
  # target column were missing.
  predictor_names <- setdiff(names(train_data), "Loan_Status")
  knn_val_predictions <- knn(
    train = train_data[, predictor_names, drop = FALSE],
    test = val_x,
    cl = train_data$Loan_Status,
    k = k
  )
  # Align the prediction levels with the reference levels before building the
  # confusion matrix.
  val_y <- factor(val_y)
  knn_val_predictions <- factor(knn_val_predictions, levels = levels(val_y))
  confusionMatrix(knn_val_predictions, val_y)
}
# Evaluate k-NN for k=5, k=7, and k=9
knn_conf_matrix_k5 <- evaluate_knn(5)
knn_conf_matrix_k7 <- evaluate_knn(7)
knn_conf_matrix_k9 <- evaluate_knn(9)
# Print the confusion matrices
print(knn_conf_matrix_k5)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 15 7
## 1 15 68
##
## Accuracy : 0.7905
## 95% CI : (0.7001, 0.8638)
## No Information Rate : 0.7143
## P-Value [Acc > NIR] : 0.04946
##
## Kappa : 0.442
##
## Mcnemar's Test P-Value : 0.13559
##
## Sensitivity : 0.5000
## Specificity : 0.9067
## Pos Pred Value : 0.6818
## Neg Pred Value : 0.8193
## Prevalence : 0.2857
## Detection Rate : 0.1429
## Detection Prevalence : 0.2095
## Balanced Accuracy : 0.7033
##
## 'Positive' Class : 0
##
print(knn_conf_matrix_k7)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 13 4
## 1 17 71
##
## Accuracy : 0.8
## 95% CI : (0.7107, 0.8717)
## No Information Rate : 0.7143
## P-Value [Acc > NIR] : 0.030009
##
## Kappa : 0.4368
##
## Mcnemar's Test P-Value : 0.008829
##
## Sensitivity : 0.4333
## Specificity : 0.9467
## Pos Pred Value : 0.7647
## Neg Pred Value : 0.8068
## Prevalence : 0.2857
## Detection Rate : 0.1238
## Detection Prevalence : 0.1619
## Balanced Accuracy : 0.6900
##
## 'Positive' Class : 0
##
print(knn_conf_matrix_k9)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 13 4
## 1 17 71
##
## Accuracy : 0.8
## 95% CI : (0.7107, 0.8717)
## No Information Rate : 0.7143
## P-Value [Acc > NIR] : 0.030009
##
## Kappa : 0.4368
##
## Mcnemar's Test P-Value : 0.008829
##
## Sensitivity : 0.4333
## Specificity : 0.9467
## Pos Pred Value : 0.7647
## Neg Pred Value : 0.8068
## Prevalence : 0.2857
## Detection Rate : 0.1238
## Detection Prevalence : 0.1619
## Balanced Accuracy : 0.6900
##
## 'Positive' Class : 0
##
# Confusion Matrix for k=5,7,9
#' Draw a confusion matrix as a heat map.
#'
#' @param cm Object whose `table` element holds the cross-tabulated counts
#'   (in this script, the result of confusionMatrix()).
#' @param title Plot title.
#' @return A ggplot object: one tile per prediction/reference pair, shaded by
#'   count and labelled with the count.
plot_confusion_matrix <- function(cm, title) {
  counts <- as.data.frame(cm$table)
  names(counts) <- c("Prediction", "Reference", "Count")

  tile_layer <- geom_tile(aes(fill = Count), color = "white")
  count_layer <- geom_text(aes(label = Count), vjust = 1)
  shading <- scale_fill_gradient(low = "white", high = "steelblue")

  ggplot(data = counts, aes(x = Reference, y = Prediction)) +
    tile_layer +
    count_layer +
    shading +
    theme_minimal() +
    labs(title = title, x = "Actual", y = "Predicted")
}
# Plot confusion matrices for k=5, k=7, and k=9
plot_confusion_matrix(knn_conf_matrix_k5, "Confusion Matrix for k-NN (k=5)")
plot_confusion_matrix(knn_conf_matrix_k7, "Confusion Matrix for k-NN (k=7)")
plot_confusion_matrix(knn_conf_matrix_k9, "Confusion Matrix for k-NN (k=9)")
# Train the Naive Bayes model on the training split, using every column other
# than Loan_Status as a predictor.
# NOTE(review): the predictors include 0/1 indicator columns stored as
# numbers; presumably naiveBayes() treats them as continuous -- confirm this
# is intended.
nb_model <- naiveBayes(Loan_Status ~ ., data = train_set_train)
# Make predictions on the validation data
nb_val_predictions <- predict(nb_model, val_features)
# Convert predictions and actual values to factors with the same levels.
# This overwrites the global val_target with its factor version.
val_target <- factor(val_target) # Ensure val_target is a factor
nb_val_predictions <- factor(nb_val_predictions, levels = levels(val_target))
# Confusion matrix for Naive Bayes (validation predictions vs. actual labels)
nb_conf_matrix <- confusionMatrix(nb_val_predictions, val_target)
print(nb_conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 13 5
## 1 17 70
##
## Accuracy : 0.7905
## 95% CI : (0.7001, 0.8638)
## No Information Rate : 0.7143
## P-Value [Acc > NIR] : 0.04946
##
## Kappa : 0.4167
##
## Mcnemar's Test P-Value : 0.01902
##
## Sensitivity : 0.4333
## Specificity : 0.9333
## Pos Pred Value : 0.7222
## Neg Pred Value : 0.8046
## Prevalence : 0.2857
## Detection Rate : 0.1238
## Detection Prevalence : 0.1714
## Balanced Accuracy : 0.6833
##
## 'Positive' Class : 0
##
# Plot Naive Bayes confusion matrix
plot_confusion_matrix(nb_conf_matrix, "Confusion Matrix for Naive Bayes")
#' Extract headline performance metrics from a confusion-matrix object.
#'
#' @param cm Object with named numeric vectors `overall` (containing
#'   "Accuracy") and `byClass` (containing "Sensitivity", "Pos Pred Value"
#'   and "Specificity"), as printed by confusionMatrix() in this script.
#' @return A one-row data frame with columns Accuracy, Recall, Precision and
#'   Specificity. Recall and Precision are the Sensitivity and Pos Pred Value
#'   entries, i.e. they refer to whichever class the confusion matrix reports
#'   as 'Positive'.
calculate_metrics <- function(cm) {
  overall_stats <- cm$overall
  class_stats <- cm$byClass
  data.frame(
    Accuracy = overall_stats["Accuracy"],
    Recall = class_stats["Sensitivity"],
    Precision = class_stats["Pos Pred Value"],
    Specificity = class_stats["Specificity"]
  )
}
# Calculate metrics for k-NN with k=5, k=7, and k=9
metrics_k5 <- calculate_metrics(knn_conf_matrix_k5)
metrics_k7 <- calculate_metrics(knn_conf_matrix_k7)
metrics_k9 <- calculate_metrics(knn_conf_matrix_k9)
# Calculate metrics for Naive Bayes
metrics_nb <- calculate_metrics(nb_conf_matrix)
# Print metrics
print(metrics_k5)
## Accuracy Recall Precision Specificity
## Accuracy 0.7904762 0.5 0.6818182 0.9066667
print(metrics_k7)
## Accuracy Recall Precision Specificity
## Accuracy 0.8 0.4333333 0.7647059 0.9466667
print(metrics_k9)
## Accuracy Recall Precision Specificity
## Accuracy 0.8 0.4333333 0.7647059 0.9466667
print(metrics_nb)
## Accuracy Recall Precision Specificity
## Accuracy 0.7904762 0.4333333 0.7222222 0.9333333
# Stack the per-model metric rows into one comparison table, tagging each row
# with the name of the model it belongs to.
# NOTE(review): the row names of the result ("Accuracy", "Accuracy1", ...) are
# carried over from the named Accuracy value and have no meaning; the Model
# column is what identifies each row.
all_metrics <- rbind(
data.frame(Model = "k-NN (k=5)", metrics_k5),
data.frame(Model = "k-NN (k=7)", metrics_k7),
data.frame(Model = "k-NN (k=9)", metrics_k9),
data.frame(Model = "Naive Bayes", metrics_nb)
)
print(all_metrics)
## Model Accuracy Recall Precision Specificity
## Accuracy k-NN (k=5) 0.7904762 0.5000000 0.6818182 0.9066667
## Accuracy1 k-NN (k=7) 0.8000000 0.4333333 0.7647059 0.9466667
## Accuracy2 k-NN (k=9) 0.8000000 0.4333333 0.7647059 0.9466667
## Accuracy3 Naive Bayes 0.7904762 0.4333333 0.7222222 0.9333333